Preprocessing Dataset
Paths
Set the path of the dataframe file.
path_import = "data/original.csv"
path_export = "data/preprocessed.csv"Libraries
Library for read dataset.
library(readr)Library for data frames processing.
library(dplyr)
library(tidyr)Library for R Markdown.
library(rmarkdown)
library(knitr)Library for data presentation.
library(scales)Library for manage strings.
library(stringr)Load dataset
Load dataset.
df <- read_csv(path_import)Statistics
Dimensions.
dim(df)## [1] 183 447
Types
View witch types are in the dataset.
col_types_all <-
df %>%
sapply(typeof) %>%
unlist()
col_types_table <-
col_types_all %>%
table()
col_types <-
col_types_table %>%
as.vector()
names(col_types) <- names(col_types_table)## character double logical
## 158 204 85
As can be seen there are the three expected types: character, double and logical.
NA
Percentaje of NA values
Define function to see the amount of NA values in the dataframe.
percent_of_NA <-
function(df){
num_of_NA <-
df %>% is.na() %>% sum()
num_of_values <-
df %>% dim() %>% prod()
percent_of_NA <-
(num_of_NA / num_of_values) %>%
percent()
return(percent_of_NA)
}percent_of_NA(df)## [1] "37%"
Columns with NA
Define functions to see the NA in columns.
num_of_NA_by_column <-
function(df){
df %>% is.na() %>% colSums()
}remove_0 <-
function(x) x[x!=0]names_of_colums_with_NA <-
function(df)
df %>%
num_of_NA_by_column() %>%
remove_0 %>%
names()percentaje_of_cols_with_NA <-
function(df)
(length(names_of_colums_with_NA(df)) / ncol(df)) %>%
percent()Compute the percentaje of cols with NA.
percentaje_of_cols_with_NA(df)## [1] "74%"
Inspect if there are columns full of NA.
is_full_of_NA <- function(col){
num_of_NA <-
col %>%
is.na() %>%
sum()
return(num_of_NA == length(col))
}cols_full_of_NA <-
df %>%
select_if(is_full_of_NA) %>%
names()## [1] "authentihash" "scans.Bkav.result"
## [3] "scans.CMC.result" "scans.ALYac.result"
## [5] "scans.Malwarebytes.result" "scans.K7AntiVirus.result"
## [7] "scans.Baidu.result" "scans.SUPERAntiSpyware.result"
## [9] "scans.Gridinsoft.result" "scans.ViRobot.result"
## [11] "scans.BitDefenderTheta.result" "scans.TACHYON.result"
## [13] "scans.VBA32.result" "scans.Zoner.result"
## [15] "scans.Panda.result" "scans.Elastic.result"
## [17] "scans.Cylance.result" "scans.SentinelOne.result"
As can be seen there are many columns that are full of NA, so can be deleted.
df <-
select(df, -all_of(cols_full_of_NA))Colums with the same value
Maybe there are columns that has the same value along all the vector, so are useless.
Define function to remove these columns.
different_values <-
function(x)
x %>% na.omit() %>% unique() %>% length()remove_columns_with_the_same_value <-
function(df)
select_if(df, function(col) different_values(col) > 1)Apply function.
num_of_cols_after_remove <-
df %>%
remove_columns_with_the_same_value() %>%
ncol()Calculate the number of columns with same value.
ncol(df) - num_of_cols_after_remove## [1] 147
Awesom! Many colums found. Let’s remove them.
df <-
remove_columns_with_the_same_value(df)Inspecting dataframe
Now let’s deeply inspect into the dataframe.
View dataframe
View dataframe.
Renaming
The column “…1” is the row number, so “n” will be a better name. The “…JSON” it’s a bad name, just “json” is fine.
df <-
df %>%
rename(n = ...1, json=..JSON)Removing cols
There are many duplicated cols, hashes & dates that can be removed, also many useless.
Dates
There are many dates in the dataset, that are not relevant for virus analysis. So let’s remove them.
Define a function for check if a col is of type Date.
not <-
function(x) !x
get_element <-
function(x, index) x[index]
is_date_col <-
function(col, pattern="^[:digit:]{4}[-:/][:digit:]{2}[-:/][:digit:]{2}")
col %>%
as.character() %>%
na.omit() %>%
get_element(1) %>%
str_detect(pattern)Columns detected.
df %>%
select_if(is_date_col) %>%
head() %>%
paged_table()Define function for remove cols by a predicate.
remove_col_if <-
function(df, fun){
cols_to_delete <-
df %>%
select_if(fun) %>%
colnames()
df <-
df %>%
select(-cols_to_delete)
return(df)
}Remove them.
df <-
remove_col_if(df, is_date_col)## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cols_to_delete)` instead of `cols_to_delete` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
Hashes
There are many hashes cols that don’t really provide useful information. So remove them.
hashes <-
c("")
df <-
df %>%
select(
-vhash,
-sha256,
-sha1,
-scan_id,
-ssdeep,
-md5,
-additional_info.androguard.certificate.serialnumber,
-additional_info.androguard.certificate.thumbprint,
-additional_info.exiftool.ZipCRC
)Scans
There are many scans of different antivirus, that has very similar information. Just keep the scan with less NA values.
Get the best col.
scans_col_names <-
df %>%
colnames() %>%
str_match_all("scans.[:alpha:]*.result") %>%
unlist()
scan_na_by_col <-
df %>%
select(all_of(scans_col_names)) %>%
num_of_NA_by_column()
scan_col_witch_min_na <-
scan_na_by_col %>%
which.min()
best_scanner_colname <-
scan_na_by_col %>%
names() %>%
get_element(scan_col_witch_min_na)
best_scanner_col <-
df %>%
select(all_of(best_scanner_colname))
best_scanner_name <-
best_scanner_colname %>%
str_split("[.]") %>%
unlist() %>%
get_element(2)## [1] "Best scan col: scans.Fortinet.result"
## [1] "Best scanner: Fortinet"
Drop all scans but the best.
col_index_scanners <-
df %>%
colnames() %>%
str_detect("scans") %>%
unlist() %>%
which()
df <-
df %>%
select(-all_of(col_index_scanners)) %>%
cbind(best_scanner_col)Individual columns
Reasons:
- json column contains all the row as JSON.
- permalink is the URL where Virus Total has the virus file.
- Main.Activity & Package are strings with all different values.
- FileTypeExtension, ZipFileName & MIMEType has same values as FileType.
- ZipBitFlag doesn’t seems to matter.
- additional_info.magic has the vesion of the ZIP file, that doesn’t seems to matter.
- Subject.DN is the JSON fragment that has all the information about the subject, but these data are decomposed in the rest of Subject cols.
df <-
df %>%
select(
-json,
-permalink,
-additional_info.androguard.AndroidApplicationInfo,
-additional_info.androguard.Main.Activity,
-additional_info.exiftool.MIMEType,
-additional_info.exiftool.FileTypeExtension,
-additional_info.exiftool.ZipFileName,
-additional_info.magic,
-additional_info.androguard.Package,
-additional_info.androguard.certificate.Subject.DN,
-additional_info.compressedview.uncompressed_size
)Groups of columns
Define a function to remove cols which name match a pattern.
remove_cols_which_name_match <-
function(df, pattern){
cols_to_remove <-
df %>%
colnames() %>%
str_which(pattern)
df_removed_cols <-
df %>%
select(-all_of(cols_to_remove))
return(df_removed_cols)
}Remove groups.
Reasons:
- Issuer group has the same information as Subject group.
- CompressedView and RiskIndicator.APK groups have the same information as file_type group
df <-
df %>%
remove_cols_which_name_match("^additional_info.androguard.certificate.Issuer.[:alpha:]*$") %>%
remove_cols_which_name_match("^additional_info.compressedview.extensions.[:alpha:]*$") %>%
remove_cols_which_name_match("^additional_info.androguard.RiskIndicator.APK.[:alpha:]*$")View results
Define a function for sort columns.
sort_cols <-
function(df){
df <-
df %>%
select(order(colnames(df)))
additionalInfo_cols_logical <-
df %>%
colnames() %>%
str_detect("additional_info")
additionalInfo_cols <-
df %>%
select(which(additionalInfo_cols_logical))
not_additionalInfo_cols <-
df %>%
select(which(!additionalInfo_cols_logical)) %>%
select(n, size, everything())
return(cbind(not_additionalInfo_cols, additionalInfo_cols))
}Sort columns.
df <-
sort_cols(df)View results.
Replacing values
Replace “Unknown” and “?” by NA
There are some columns that hast the value “Unknown” or “?” instead of NAs. So let’s replace them.
Define a function to replace values in cols that satisfy a predicate.
replace_when <-
function(df, fun, value, replacement){
cols_to_replace <-
df %>%
select_if(fun) %>%
colnames()
df_replaced_cols <-
df %>%
select(all_of(cols_to_replace)) %>%
sapply(function(col) replace(col, which(col==value), replacement))
df_without_replaced_cols <-
df %>%
select(-all_of(cols_to_replace))
return(cbind(df_without_replaced_cols, df_replaced_cols))
}Replace ? and Unknown for NAs.
df <-
df %>%
replace_when(function(col) any(str_detect(col, fixed("Unknown"))), "Unknown", NA) %>%
replace_when(function(col) any(str_detect(col, fixed("?"))), "?", NA)Replace NA for 0
Define functions
Define functions for replace NAs.
replace_na_which_colname_match <-
function(df, pattern, replacement){
cols_to_replace <-
df %>%
colnames() %>%
str_which(pattern)
df_replaced_cols <-
df %>%
select(all_of(cols_to_replace)) %>%
sapply(function(col) replace_na(col, replacement))
df_without_replaced_cols <-
df %>%
select(-all_of(cols_to_replace))
return(cbind(df_without_replaced_cols, df_replaced_cols))
}replace_na_when <-
function(df, fun, replacement){
cols_to_replace <-
df %>%
select_if(fun) %>%
colnames()
df_replaced_cols <-
df %>%
select(all_of(cols_to_replace)) %>%
sapply(function(col) replace_na(col, replacement))
df_without_replaced_cols <-
df %>%
select(-all_of(cols_to_replace))
return(cbind(df_without_replaced_cols, df_replaced_cols))
}Indiviual columns
The columns AndroidApplication and APK.SHARED.LIBRARIES shoud have 0s insteas of NAs.
df <-
df %>%
replace_na_which_colname_match("AndroidApplication", 0) %>%
replace_na_which_colname_match("APK.SHARED.LIBRARIES", 0) Groups of colums
The permissions (PERM) and the file types (file_types) groups of columns, seems that there are NAs where there should be 0s. So it would be better to replace them.
df <-
df %>%
replace_na_which_colname_match("PERM", 0) %>%
replace_na_which_colname_match("file_types", 0)View results
Sort columns.
df <-
sort_cols(df)View results.
New & modifiead colums
There are columns that must be only one, others provide more information by operating two columns, or summarises information.
New total permissions column
Create a new column that sums all permissions of permissions columns.
pattern <- "additional_info.androguard.RiskIndicator.PERM"
df_without_permissions <-
df %>%
select(., -(str_which(colnames(.), pattern)))
df_permissions <-
df %>%
select(., str_which(colnames(.), pattern)) %>%
mutate(., total_PERMs = rowSums(.))
df <- cbind(df_without_permissions, df_permissions)Merge both MP3 columns into one
There are two mp3 extension columns, one for .MP3 and the other for .mp3, both are mp3 files. The others columns has the name of the extension in upper case, so let’s sum both into the MP3 column.
mp3_cols_logical <-
df %>%
colnames() %>%
tolower() %>%
str_detect("mp3")
mp3_upper_colname <-
colnames(df)[which(mp3_cols_logical)] %>%
str_match("^.*MP3$") %>%
unlist() %>%
na.omit()sprintf("MP3 colname: %s", mp3_upper_colname)## [1] "MP3 colname: additional_info.compressedview.file_types.MP3"
Sum them into additional_info.compressedview.file_types.MP3
mp3_col <-
df %>%
select(which(mp3_cols_logical)) %>%
rowSums(na.rm = TRUE)
df <-
df %>%
select(-which(mp3_cols_logical)) %>%
mutate(additional_info.compressedview.file_types.MP3 = mp3_col)Express increased uncompressed size as percentage
There are two columns that refers to ZIP size, one for the compressed size and the other for the uncompressed size. It would be easier to compare them with the percentage of size increased after decompressed it.
increased_size_after_unzip <-
df$additional_info.exiftool.ZipUncompressedSize /
df$additional_info.exiftool.ZipCompressedSize
df <-
df %>%
select(-additional_info.exiftool.ZipUncompressedSize,
-additional_info.exiftool.ZipCompressedSize) %>%
mutate(additional_info.exiftool.ZipIncreasedUncompressedSize = increased_size_after_unzip)Suspicious heuristic as logical
The column additional_info.trendmicro.housecall.heuristic has a suspicious flag for some rows, the others are NA.
- So will be better to make this column logical:
- NA -> FALSE
- Suspicious -> True
- Also rename it to make it more understandable:
- additional_info.trendmicro.housecall.heuristic -> additional_info.suspicious
suspicious <-
df$additional_info.trendmicro.housecall.heuristic %>%
is.na() %>%
not()
df <-
df %>%
select(-additional_info.trendmicro.housecall.heuristic) %>%
mutate(additional_info.suspicious = suspicious)Decompose unpacker
he column additional_info.f.prot.unpacker has two possible values: “appended” and “UTF-8”, and when both happen they are combined into “appended, UTF-8”. So will be better to split this column into one logical for each of them.
Compute columns.
unpacker_appended <-
df$additional_info.f.prot.unpacker %>%
sapply(function(str) str_detect(str, "appended")) %>%
replace_na(FALSE)
unpacker_utf8 <-
df$additional_info.f.prot.unpacker %>%
sapply(function(str) str_detect(str, "UTF-8")) %>%
replace_na(FALSE)Add them to dataframe.
df <-
df %>%
select(-additional_info.f.prot.unpacker) %>%
mutate(additional_info.f.prot.unpacker.appended = unpacker_appended,
additional_info.f.prot.unpacker.UTF8 = unpacker_utf8)Decompose trid
The column additional_info.trid has percentages of archives.
df$additional_info.trid[1]## [1] "Android Package (57.0%)\r\nJava Archive (20.0%)\r\nSweet Home 3D design (generic) (15.5%)\r\nZIP compressed archive (5.9%)\r\nPrintFox/Pagefox bitmap (640x800) (1.4%)"
Will be better to decompose this column into a column with the percentage for each value.
Extract values and percentages
trid_split_lines <-
df$additional_info.trid %>%
sapply(function(str) str_split(str, "\n"))
trid_split_lines[[3]]## [1] "Java Archive (72.9%)\r"
## [2] "ZIP compressed archive (21.6%)\r"
## [3] "PrintFox/Pagefox bitmap (640x800) (5.4%)"
trid_split_value_percentage <-
trid_split_lines %>%
sapply(function(row)
sapply(row, function(str)
str_split(str, " \\(|%\\)")))
trid_split_value_percentage[[1]][[3]]## [1] "Sweet Home 3D design" "generic)" "15.5"
## [4] "\r"
trid_values <-
trid_split_value_percentage %>%
sapply(function(row)
sapply(row, function(value_percentage)
get_element(value_percentage, 1)))
trid_values[[1]]## Android Package (57.0%)\r
## "Android Package"
## Java Archive (20.0%)\r
## "Java Archive"
## Sweet Home 3D design (generic) (15.5%)\r
## "Sweet Home 3D design"
## ZIP compressed archive (5.9%)\r
## "ZIP compressed archive"
## PrintFox/Pagefox bitmap (640x800) (1.4%)
## "PrintFox/Pagefox bitmap"
trid_percentage <-
trid_split_value_percentage %>%
sapply(function(row)
sapply(row, function(value_percentage)
as.double(get_element(value_percentage,
length(value_percentage) - 1))))
trid_percentage[[1]]## Android Package (57.0%)\r
## 57.0
## Java Archive (20.0%)\r
## 20.0
## Sweet Home 3D design (generic) (15.5%)\r
## 15.5
## ZIP compressed archive (5.9%)\r
## 5.9
## PrintFox/Pagefox bitmap (640x800) (1.4%)
## 1.4
Define a function to convert the name to a proper trid colname.
trid_to_colname <-
function(name)
name %>%
str_to_title() %>%
str_match_all("[:alpha:]*") %>%
unlist() %>%
paste0(collapse = " ") %>%
str_remove_all(" ") %>%
sapply(function(name) paste("additional_info.trid", name, sep = "."))Define function for rename the percentages.
rename_as_value <-
function(v){
values_names <-
names(v) %>%
sapply(function(name) str_split(name, " \\(")) %>%
sapply(function(name_split) get_element(name_split, 1)) %>%
sapply(trid_to_colname)
names(v) <- values_names
return(v)
}
trid_percentage[[1]] %>%
rename_as_value()## additional_info.trid.AndroidPackage
## 57.0
## additional_info.trid.JavaArchive
## 20.0
## additional_info.trid.SweetHomedDesign
## 15.5
## additional_info.trid.ZipCompressedArchive
## 5.9
## additional_info.trid.PrintfoxPagefoxBitmap
## 1.4
Rename the percentages.
trid_percentage_names_as_value <-
trid_percentage %>%
sapply(rename_as_value)
trid_percentage_names_as_value[[1]]## additional_info.trid.AndroidPackage
## 57.0
## additional_info.trid.JavaArchive
## 20.0
## additional_info.trid.SweetHomedDesign
## 15.5
## additional_info.trid.ZipCompressedArchive
## 5.9
## additional_info.trid.PrintfoxPagefoxBitmap
## 1.4
Create columns
Get columns names.
trid_labels <-
trid_values %>%
unlist() %>%
unique()
trid_labels## [1] "Android Package" "Java Archive"
## [3] "Sweet Home 3D design" "ZIP compressed archive"
## [5] "PrintFox/Pagefox bitmap" "Opera Widget"
## [7] "Mozilla Archive Format" "Dalvik Dex class"
## [9] "OpenOffice Extension" "VYM Mind Map"
## [11] "Mozilla Firefox browser extension" "Konfabulator widget"
trid_values_colnames <-
trid_labels %>%
sapply(trid_to_colname)
trid_values_colnames## Android Package.AndroidPackage
## "additional_info.trid.AndroidPackage"
## Java Archive.JavaArchive
## "additional_info.trid.JavaArchive"
## Sweet Home 3D design.SweetHomedDesign
## "additional_info.trid.SweetHomedDesign"
## ZIP compressed archive.ZipCompressedArchive
## "additional_info.trid.ZipCompressedArchive"
## PrintFox/Pagefox bitmap.PrintfoxPagefoxBitmap
## "additional_info.trid.PrintfoxPagefoxBitmap"
## Opera Widget.OperaWidget
## "additional_info.trid.OperaWidget"
## Mozilla Archive Format.MozillaArchiveFormat
## "additional_info.trid.MozillaArchiveFormat"
## Dalvik Dex class.DalvikDexClass
## "additional_info.trid.DalvikDexClass"
## OpenOffice Extension.OpenofficeExtension
## "additional_info.trid.OpenofficeExtension"
## VYM Mind Map.VymMindMap
## "additional_info.trid.VymMindMap"
## Mozilla Firefox browser extension.MozillaFirefoxBrowserExtension
## "additional_info.trid.MozillaFirefoxBrowserExtension"
## Konfabulator widget.KonfabulatorWidget
## "additional_info.trid.KonfabulatorWidget"
Create an empty tibble with colnames set.
df_trid <-
matrix(nrow = nrow(df),
ncol = length(trid_values_colnames)) %>%
as_tibble(.name_repair = ~ trid_values_colnames) %>%
mutate_each(as.double)Colnames.
df_trid %>%
colnames()## [1] "additional_info.trid.AndroidPackage"
## [2] "additional_info.trid.JavaArchive"
## [3] "additional_info.trid.SweetHomedDesign"
## [4] "additional_info.trid.ZipCompressedArchive"
## [5] "additional_info.trid.PrintfoxPagefoxBitmap"
## [6] "additional_info.trid.OperaWidget"
## [7] "additional_info.trid.MozillaArchiveFormat"
## [8] "additional_info.trid.DalvikDexClass"
## [9] "additional_info.trid.OpenofficeExtension"
## [10] "additional_info.trid.VymMindMap"
## [11] "additional_info.trid.MozillaFirefoxBrowserExtension"
## [12] "additional_info.trid.KonfabulatorWidget"
Types.
df_trid %>%
sapply(is.double) %>%
sum() == ncol(df_trid)## [1] TRUE
Insert values.
for(row_index in 1:nrow(df)){
row <- trid_percentage_names_as_value[[row_index]]
for(percentage_index in 1:length(row)){
percentage <- row[percentage_index]
colname <- names(percentage)
df_trid[row_index, colname] <- percentage
}
}Replace NA for 0.
df_trid <-
df_trid %>%
replace(is.na(.), 0)Merge dataframes
Finally merge df and df_trid into one.
df <-
df %>%
select(-additional_info.trid) %>%
cbind(df_trid)View results
Sort columns.
df <-
sort_cols(df)View results.
Save dataframe
After all preprocessing let’s save it into CSV.
write.csv(df, path_export)Functions for preprocessing
As factor
labels <-
function(n){
if(n == 5){
return(c("very low", "low", "medium", "high", "very high"))
}else if(n == 4){
return(c("very low", "low", "high", "very high"))
}else if(n == 3){
return(c("low", "medium", "high"))
}else if(n == 2){
return(c("low", "high"))
}else{
stop("Not avalible")
}
}
cut_by_quantiles <-
function(col){
quantiles <-
col %>%
quantile(na.rm = TRUE) %>%
unique()
if(length(quantiles) > 2){
col <-
col %>%
cut(breaks = quantiles,
labels = labels(length(quantiles)-1),
include.lowest = TRUE)
}
return(col)
}
df_cut_by_quantiles <-
function(df){
df_without_numeric <-
df[sapply(df, function(col) !is.numeric(col))]
df_numeric <-
df %>%
select_if(is.numeric)
df_numeric <-
df_numeric %>%
lapply(cut_by_quantiles)
return(cbind(df_without_numeric, df_numeric))
}